# Load necessary packages
import os
import numpy as np
import pandas as pd
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from kneed import KneeLocator
import matplotlib.pyplot as plt
import gensim.downloader as api
import re
from sklearn.decomposition import PCA
import random


import sys
import pkg_resources

# Record (name, version) for every module that is already imported and whose
# module name matches a distribution key known to pkg_resources.
installed_by_key = pkg_resources.working_set.by_key
loaded_modules = []
for module_name in sys.modules.keys():
    if module_name in installed_by_key:
        module_version = pkg_resources.get_distribution(module_name).version
        loaded_modules.append((module_name, module_version))

# Print the pairs alphabetically in pip's "name==version" format
for module_name, module_version in sorted(loaded_modules):
    print(f"{module_name}=={module_version}")



# Set Seed for Reproducibility
# random.seed() only seeds Python's own generator. NumPy's global generator is
# seeded as well so that NumPy-based code which is not given an explicit
# random_state (e.g. scikit-learn's randomized solvers) is repeatable too.
# NOTE(review): gensim's documentation states that reproducibility between
# interpreter launches also requires a fixed PYTHONHASHSEED environment
# variable -- confirm this is set when replicating.
random.seed(1996)
np.random.seed(1996)
#############################
# Load Data
df = pd.read_csv('model_text.csv')

# Load pretrained word2vec embeddings through the gensim downloader
wv = api.load('word2vec-google-news-300')

### Estimation for Model in Paper ###
# Issue statements that are fed into the model
df_text = df['issue_final']

# Candidate identifier belonging to each issue statement
tag = df['candidate_id']

# Build one TaggedDocument per issue statement: the tokenized statement is
# the word list and the candidate id (as a string) is its only tag.
tagged_data = [
    TaggedDocument(words=word_tokenize(str(df_text[row])), tags=[str(tag[row])])
    for row in range(len(df))
]

# Build Doc2Vec model with DBOW architecture (dm=0) and simultaneous word embedding training (dbow_words = 1)
# Specify window of 6, embedding dimension of 300, and 20 epochs and remove words not appearing 5 times.
### Note -- can only use one core when estimating otherwise seed does not work for replication
model = Doc2Vec(dm=0, dbow_words=1,
                vector_size=300, window=6, min_count=5,
                workers=1, epochs=20, seed=1996)

# Build Vocab from Tagged Data
model.build_vocab(tagged_data)

# Initialise the word vectors from the pre-trained embeddings.
# Rows of model.wv.vectors follow the model's own vocabulary order
# (model.wv.key_to_index), which differs from the row order of the pre-trained
# matrix. Each pre-trained vector is therefore copied into the row of the
# matching word; swapping in the whole pre-trained matrix would pair every
# word with the vector of an unrelated word. Words that are absent from the
# pre-trained vocabulary keep the initial values set by build_vocab.
for word, row in model.wv.key_to_index.items():
    if word in wv.key_to_index:
        model.wv.vectors[row] = wv[word]

# Train Model
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# Create list of candidates with no duplicates
candidates = tag.drop_duplicates()

# Isolate Candidate Embeddings and Build Empty Matrix
# size of embeddings
M = model.vector_size
# Number of candidates
P = len(candidates)
# Create empty PxM matrix
z = np.zeros((P, M))

# Populate one row per unique candidate_id, in the order of `candidates`.
# Each document vector is looked up by its tag, built as str(tag[label]) in
# the same way as the tags of the tagged documents, instead of by position,
# so the row order does not depend on how gensim numbers the tags internally.
for row, label in enumerate(candidates.index):
    z[row, :] = model.dv[str(tag[label])]
    
# Determine Number of Dimensions for PCA by first fitting PCA with as many
# components as the data allow. PCA cannot extract more components than
# min(number of candidates, embedding size), so the count is derived from z
# instead of being fixed at 300.
n_components = min(z.shape)
pca = PCA(n_components=n_components)
# Fit on candidate embeddings
pca.fit(z)

# Cumulative share of variance explained by the first k components
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Find the elbow point to justify number of dimensions.
# Cumulative explained variance rises and then flattens, i.e. it is a
# concave, increasing curve; KneeLocator must be told that shape to locate
# the point of diminishing returns.
kneedle = KneeLocator(range(1, n_components + 1), cumulative_explained_variance,
                      curve='concave', direction='increasing')
elbow_point = kneedle.elbow
print("Elbow Point:", elbow_point)

# Re-fit PCA to single dimension for WEB Scores.
# random_state is fixed so the scores are repeatable even when scikit-learn
# selects a randomized solver (which solver is chosen depends on the data
# shape and the scikit-learn version).
pca_candidate = PCA(n_components=1, random_state=1996)
# Create web score df: one first-component score per row of z
web_scores = pca_candidate.fit_transform(z)
Zcand = pd.DataFrame(web_scores, columns=['web_score'])
# reset_index() gives the candidate ids a 0..P-1 index that lines up with the rows of z
candidates_fixed = candidates.reset_index()
# Adding candidate ID to WEB Scores DF
Zcand['candidate_id'] = candidates_fixed['candidate_id']

# Save Web Scores estimates
Zcand.to_csv('webscore_estimates.csv', index=False)

# Put the full embeddings in a data frame (one column per dimension),
# attach the candidate ID and save candidate embeddings
df_candidates = pd.DataFrame(z)
df_candidates['candidate_id'] = candidates_fixed['candidate_id']
df_candidates.to_csv('candidate_embeddings.csv', index=False)

# Collect every distinct token that appears in the tagged documents
trained_words = set()
for doc in tagged_data:
    trained_words.update(doc.words)

# Keep the tokens that are in the model vocabulary and fetch their vectors.
# The tokens are sorted first: set iteration order for strings can change
# between interpreter launches, which would otherwise shuffle the rows of the
# saved file from run to run.
words = [word for word in sorted(trained_words) if word in model.wv]
word_vectors = [model.wv[word] for word in words]

# Save Word Embeddings (one row per word, one column per dimension)
df_words = pd.DataFrame(word_vectors)
df_words['word'] = words
df_words.to_csv('word_embeddings.csv', index=False)





